# Load ../Soil_Nutrient_Year.csv into soil_df, showing read progress.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, which would silently change the meaning of these arguments.
soil_df <- fread(
  "../Soil_Nutrient_Year.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  showProgress = TRUE
)
##
Read 36.5% of 685480 rows
Read 55.4% of 685480 rows
Read 74.4% of 685480 rows
Read 90.4% of 685480 rows
Read 685480 rows and 19 (of 19) columns from 0.081 GB file in 00:00:06
# Drop the SampleNo and V1 columns from soil_df.
soil_df$SampleNo <- NULL
soil_df$V1 <- NULL

# Count missing (TRUE) versus present (FALSE) SoilPh values.
table(is.na(soil_df[["SoilPh"]]))
##
## FALSE TRUE
## 679161 6319
# Count SoilPh values above 14: 1652 records have incorrect data (<1%).
# Missing SoilPh values compare as NA, so table() leaves them out of the counts.
table(soil_df[["SoilPh"]] > 14)
##
## FALSE TRUE
## 677509 1652
6,319 rows are missing a SoilPh value (<1% of the 685,480 rows).
1,652 rows have an invalid SoilPh value greater than 14 (<1% of the non-missing values).
Average SoilPh (missing values excluded; the invalid values above 14 are still included): {mean(soil_df$SoilPh, na.rm = T)}
# Box plot of SoilPh against Year, converted to an interactive plotly widget.
# NOTE(review): fill is mapped to SoilPh, the same variable shown on the y
# axis -- confirm this is intended rather than a grouping column such as Year.
ggplotly(
  ggplot(soil_df, aes(Year, SoilPh, fill = SoilPh)) +
    geom_boxplot()
)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Warning: Removed 6319 rows containing non-finite values (stat_boxplot).